library(tidyverse)
library(rvest)
library(knitr)
library(leaflet)
#install.packages("rgdal",repos = "http://cran.us.r-project.org")
library(rgdal)
library(lubridate)
#library(dplyr)
#library(plyr)
library(plotly)
col1 = "#d8e1cf" 
col2 = "#438484"
theme_set(theme_minimal() + theme(legend.position = "bottom"))
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d

Load the dataset NYPD shooting incident

shooting_initial = 
  read_csv("./data/NYPD_Shooting.csv") %>%
  janitor::clean_names()
## Rows: 23585 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): OCCUR_DATE, BORO, LOCATION_DESC, PERP_AGE_GROUP, PERP_SEX, PERP_R...
## dbl   (7): INCIDENT_KEY, PRECINCT, JURISDICTION_CODE, X_COORD_CD, Y_COORD_CD...
## lgl   (1): STATISTICAL_MURDER_FLAG
## time  (1): OCCUR_TIME
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

check null

shooting_initial %>%
  summarise_all(~ sum(is.na(.)))
## # A tibble: 1 × 19
##   incident_key occur_date occur_time  boro precinct jurisdiction_code
##          <int>      <int>      <int> <int>    <int>             <int>
## 1            0          0          0     0        0                 2
## # … with 13 more variables: location_desc <int>, statistical_murder_flag <int>,
## #   perp_age_group <int>, perp_sex <int>, perp_race <int>, vic_age_group <int>,
## #   vic_sex <int>, vic_race <int>, x_coord_cd <int>, y_coord_cd <int>,
## #   latitude <int>, longitude <int>, lon_lat <int>

For col boro

shooting = shooting_initial %>% 
  mutate(boro = as.factor(boro)) %>%
  mutate(location_desc = replace_na(location_desc, "NONE")) %>%
  mutate(location_desc = as.factor(location_desc)) %>%
  separate(occur_date, into = c("month", "day", "year")) %>% 
  mutate(month = as.numeric(month)) %>% 
  arrange(year, month) %>% 
 # mutate(month = month.name[month]) %>% 
  mutate(year = as.character(year)) %>% 
  mutate(boro = tolower(boro)) %>% 
  mutate(boro = if_else(boro == "staten island", "staten_island", boro)) %>% 
  rename(borough = boro) %>% 
    mutate(date = str_c(month, day, year, sep = "/")) %>% 
  select(incident_key, date, everything())

Next, clean the COVID-19 case count data

Importing COVID-19 case count data

covid_counts = read.csv("./data/COVID19_data.csv", sep = ";") %>% as_tibble()

The clean dataset contains only day-by-day COVID-19 case count for each borough and the total case count in NYC of a particular day.

clean_covid = covid_counts %>% 
   janitor::clean_names() %>% 
   rename(date = date_of_interest) %>% 
   select(date, contains("case_count")) %>% 
   select(-contains(c("probable_case_count", "case_count_7day_avg", "all_case_count_7day_avg"))) %>%
   separate(date, into = c("month", "day", "year")) %>% 
   mutate_all(as.character) %>% 
   mutate_if(is.character, gsub, pattern = ",", replacement = "") %>% 
   mutate_if(is.character, as.numeric) %>%
   pivot_longer(
    cols = bx_case_count:si_case_count,
    names_to = "borough",
    values_to = "borough_case_count"
  ) %>% 
  mutate(borough = gsub("_case_count", "", borough)) %>% 
  mutate(borough = recode(borough, "bx" = "bronx","bk" = "brooklyn","mn" = "manhattan","si" = "staten_island","qn" = "queens")) %>% 
  relocate(case_count, .after = borough_case_count) %>% 
  rename(total_case_count = case_count) %>% 
  mutate(date = str_c(month, day, year, sep = "/")) %>% 
  select(date, everything())

head(clean_covid)
## # A tibble: 6 × 7
##   date      month   day  year borough       borough_case_count total_case_count
##   <chr>     <dbl> <dbl> <dbl> <chr>                      <dbl>            <dbl>
## 1 2/29/2020     2    29  2020 bronx                          0                1
## 2 2/29/2020     2    29  2020 brooklyn                       0                1
## 3 2/29/2020     2    29  2020 manhattan                      1                1
## 4 2/29/2020     2    29  2020 queens                         0                1
## 5 2/29/2020     2    29  2020 staten_island                  0                1
## 6 3/1/2020      3     1  2020 bronx                          0                0

Make the heatmap based on time

shooting_heatmap = shooting_initial %>%
  mutate(occur_date = as.Date(occur_date,'%m/%d/%Y')) %>%
  mutate(occur_date = weekdays(occur_date)) %>%
  separate(occur_time, into = c("hour", "minute", "second")) %>%
  mutate(hour = as.factor(hour)) %>%
  select(incident_key, occur_date, hour) %>%
  mutate(occur_date = as.factor(occur_date),
         occur_date = fct_relevel(occur_date, "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday" , "Saturday"))

dayHour = plyr::ddply(shooting_heatmap, c( "hour", "occur_date"), summarise, N = length(incident_key))
attach(dayHour)

heatmap = ggplot(dayHour, aes(hour, occur_date)) + 
  geom_tile(aes(fill = N),colour = "white", na.rm = TRUE) +
  scale_fill_gradient(low = col1, high = col2) +  
  guides(fill = guide_legend(title = "Total Shooting Cases")) +
  theme_bw() + 
  theme_minimal() + 
  labs(title = "Time Based Heatmap",
       x = "Shooting Cases Per Hour", y = "Day of Week") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

heatmap

According to the result of this heatmap, the midnight of weekends(Sunday and Saturday) have the highest risk of shooting cases. Additionally, daytime between 7 in the morning and 19 in the evening seems to have lower shooting cases than the other time of the day.

Shooting Incidents Across Time

Distribution of Shooting Case of Years

shooting_year = shooting %>%
  group_by(year) %>% 
  summarise(n_obs = n())
#visualization shooting incidence trend
shooting_year %>% 
  plot_ly( x = ~year, y = ~n_obs, type = "scatter", mode = "lines+markers") %>% 
  layout(title = "Shooting Incidence Trend from 2006 to 2020",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Frequency"))

Distribution of Shooting Case of Months

shooting_month = shooting %>%
  mutate(month = as.factor(month)) %>% 
  group_by(month) %>% 
  summarise(n_obs = n())

shooting_month %>% 
  plot_ly(x = ~month, y = ~n_obs, color = ~month, type = "bar") %>% 
  layout( title = "The Distribution of Shooting Incidence by Month",
          xaxis = list(title = "Month"),
          yaxis = list(title = "Frequency"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Distribution of Shooting Case of Time Range Across day

shooting_time = shooting %>% 
##format the occur_time variable to only hours.
  mutate(occur_time_hour = format(as.POSIXct(occur_time), format = "%H")) %>% 
  mutate(occur_time_hour = as.numeric(occur_time_hour)) %>% 
  group_by(occur_time_hour) %>% 
  summarise(case_numb = n())

#divide day time to 4 groups: 0-6;6-12;12-18;18-24
shooting_time = shooting_time %>% 
  mutate(occur_time_range = case_when(
    occur_time_hour >= 0 & occur_time_hour < 6 ~ "0-6",
    occur_time_hour >= 6 & occur_time_hour < 12 ~ "6-12",
    occur_time_hour >= 12 & occur_time_hour < 18 ~ "12-18",
    occur_time_hour >= 18 & occur_time_hour < 24 ~ "18-24"))
shooting_time = shooting_time %>% 
  mutate(occur_time_range = factor(occur_time_range, levels = c("0-6","6-12","12-18","18-24")))

ggplot(shooting_time, aes(x = occur_time_range, y = case_numb, fill = occur_time_range)) + geom_col(alpha = 1) + labs(x = "Occur Time Range",
       y = "Frequency",
       title = "Distribution of Shooting Case by Day")

Distribution of shooting case across month in 2020

(since the covid-19 starts from March 2020, to see if there any relation between shooting and Covid.)

shooting_2020 = shooting %>%
  filter(year == "2020") %>% 
  mutate(month = as.factor(month)) %>% 
  group_by(month) %>% 
  summarise(n_obs = n())

ggplot(shooting_2020, aes(x = month, y = n_obs, fill = month)) + geom_col(alpha = 1) + labs(
  x = "Month",
  y = "Frequency",
  title = "Distribution of Shooting Case by Day in NYC")

Shooting Incidents Across Space

This is an interactive map of shooting incidents from 2006 to 2021 in NYC. Incidents’ details will be displayed after clicking the icon. GIS data of Boroughs’ boundaries were obtained from NYC Open Data.

Borough Boundaries

nyc_boro = readOGR("./data/Borough_Boundaries/geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877.shp", layer = "geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877")
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS =
## dumpSRS, : Discarded datum WGS84 in Proj4 definition: +proj=longlat +ellps=WGS84
## +no_defs
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/sylvia/Desktop/P8105/gun_violence_nyc.github.io/data/Borough_Boundaries/geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877.shp", layer: "geo_export_2204bc6b-9c17-46ed-8a67-7245a1e15877"
## with 5 features
## It has 4 fields
shooting %>% 
#  filter(year %in% c("2020", "2021")) %>%
  mutate_at(c("perp_age_group", "perp_sex", "perp_race"), funs(ifelse(is.na(.), "unknown", .))) %>% 
  mutate(labels = str_c("<b>Incident Key: </b>", incident_key, 
                    "<br>", "<b>Date: </b>", date,
                    "<br>", "<b>Borough: </b>", borough,
                    "<br>", "<b>Murdered: </b>", statistical_murder_flag,
                    "<br>", "<b>Perpetrator's Race: </b>", perp_race,
                    "<br>", "<b>Victim's Race: </b>", vic_race,
                    "<br>", "<b>Perpetrator's Age: </b>", perp_age_group,
                    "<br>", "<b>Victim's Age: </b>", vic_age_group
                    )) %>% 
  leaflet() %>% 
  addTiles() %>% 
  addProviderTiles("CartoDB.Positron") %>% 
  addMarkers(lng = ~longitude, lat = ~latitude, popup = ~labels,
             clusterOptions = markerClusterOptions()) %>% 
  addPolygons(data = nyc_boro,
              weight = 0.85,
              label = ~nyc_boro@data$boro_name)
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Demographic Characteristics

Based on the shooting dataset we obtained, we could study the demographic characteristics of the perpetrators and victims to see whether there exists some potential patterns/correlations or not.

perpetrators’ characteristics

First, let’s look at perpetrators’ demographic information.

According to the previously checked numbers of null values in each column, there are 8295 values missing in perp_age_group and 8261 values missing in both perp_sex and perp_race. The information missing maybe not available or unknown at the time of the report and should be considered as either “Unknown/Not Available/Not Reported.”

# group by perp_age_group and do a little summary
sum_perp_age = 
  shooting %>% 
  mutate(
    perp_age_group = ifelse(is.na(perp_age_group), "UNKNOWN", perp_age_group),
    perp_age_group = as.factor(perp_age_group)
  ) %>% 
  # count distinct incident case
  distinct(incident_key, .keep_all = TRUE) %>% 
  group_by(perp_age_group) %>% 
  arrange(perp_age_group) %>%
  relocate(perp_age_group) %>% 
  # filter the abnormal value
  filter(!(perp_age_group %in% c("1020", "224", "940"))) %>% 
  summarize(number = n())

# Visualize by a pie chart
sum_perp_age %>% 
  mutate(perc = scales::percent(number/sum(number))) %>% 
  ggplot(aes(x = "", y = perc, fill = perp_age_group)) +
  geom_bar(width = 1, stat = "identity") +
  geom_text(aes(label = perc),
            position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) +
  scale_fill_brewer(palette = "Pastel1") +
  theme_void() + 
  guides(fill = guide_legend(title = "Perpetrator's Age Group")) +
  labs(title = "Pie Chart for Different Perpetrator Age Groups") +
  theme(legend.position = "right")

Therefore, except those with unknown age group, most perpetrators are in the age group 18-24 and 24-44, which takes up 39.4% in total.

# Now look at perpetrators' sex and race
perp_sex_race = 
  shooting %>% 
  distinct(incident_key, .keep_all = TRUE) %>% 
  mutate(
    perp_sex = ifelse(is.na(perp_sex), "U", perp_sex),
    perp_race = ifelse(is.na(perp_race), "UNKNOWN", perp_race)
  ) %>% 
  mutate(perp_race = fct_infreq(as.factor(perp_race))) %>%
  group_by(perp_sex, perp_race) %>% 
  relocate(perp_sex, perp_race) %>% 
  arrange(perp_sex, perp_race) %>% 
  summarise(num = n()) %>% 
  mutate(perc = round(num/sum(num), digits = 2))
## `summarise()` has grouped output by 'perp_sex'. You can override using the `.groups` argument.
# Get the position for the label of the pie charts
position1 = 
  perp_sex_race %>% 
  filter(perp_sex != "U") %>%
  mutate(csum = rev(cumsum(rev(perc*100))), 
         pos = perc*100/2 + lead(csum, 1),
         pos = if_else(is.na(pos), perc*100/2, pos))

# pie chart
perp_sex_race %>% 
  filter(perp_sex != "U") %>%
  ggplot(aes(x = "" , y = perc*100, fill = fct_inorder(perp_race))) +
  geom_col(width = 1, color = 1) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "Pastel1") +
  ggrepel::geom_label_repel(data = position1,
                   aes(y = pos, label = paste0(perc*100, "%")),
                   size = 4.5, nudge_x = 1, show.legend = FALSE) +
  guides(fill = guide_legend(title = "Perpetrator's Race")) +
  theme_void() +
  labs(title = "Pie Chart For Different Perpetrator Race in Different Sex") +
  facet_grid(.~perp_sex)

# Or we can try to visualize using bar chart
perp_sex_race %>% 
  ggplot(aes(x = perp_race, y = num, fill = perp_sex)) +
  geom_bar(
    stat = "identity", position=position_dodge()
  ) +
  labs(
    x = "Perpetrator's Race",
    y = "Number of Perpetrators",
    title = "Number of Perpetrators in Different Races"
  ) +
  coord_flip() +
  guides(fill = guide_legend(title = "Perpetrator's Race")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "right")

From the perp_sex_race chart and the pie charts, most of the perpetrators are males (98.11%), and two races with highest rate are Black (M: 74%, F:67%) and White Hispanic(M: 13%, F: 19%).

Victims’ characteristics

There is no value missing in the victims’ demographic characteristics: vic_age_group, vic_sex and vic_race.

What age groups are most victims in?

vic_age_group =
  shooting %>% 
  mutate(as.factor(vic_age_group)) %>%
  group_by(vic_age_group) %>% 
  relocate(vic_age_group) %>% 
  arrange(vic_age_group) %>% 
  summarise(num = n())

vic_age_group %>% 
  mutate(perc = scales::percent(num/sum(num))) %>% 
  ggplot(aes(x = "", y = perc, fill = vic_age_group)) +
  geom_bar(width = 1, stat = "identity") +
  geom_text(aes(label = perc),
            position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) +
  scale_fill_brewer(palette = "Pastel1") +
  theme_void() + 
  guides(fill = guide_legend(title = "Victim's Age Group")) +
  theme(legend.position = "right") +
  labs(
    x = "Victim's Age Group",
    y = "Number of Victims",
    title = "Number of Victims in Different Age Groups"
  )

As shown above, most victims are in the age group 18-24 and 25-44.

For victims’ sex and race, similarly, first check the bar plot and pie chart.

# The sex and race characteristics of victims
vic_sex_race = 
  shooting %>% 
  mutate(vic_race = fct_infreq(as.factor(vic_race))) %>% 
  group_by(vic_sex, vic_race) %>% 
  relocate(vic_sex, vic_race) %>% 
  arrange(vic_sex, vic_race) %>% 
  summarise(num = n()) %>% 
  mutate(perc = round(num/sum(num), digits = 2))
## `summarise()` has grouped output by 'vic_sex'. You can override using the `.groups` argument.
# bar plot
vic_sex_race %>% 
  ggplot(aes(x = vic_race, y = num, fill = vic_sex)) +
  geom_bar(
    stat = "identity", position=position_dodge()
  ) +
  labs(
    x = "Victim's Race",
    y = "Number of Victims",
    title = "Number of Victims in Different Races"
  ) +
  coord_flip() +
  guides(fill = guide_legend(title = "Victim's Race")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "right")

# To see the percentage of different race, we can draw a pie chart

# Let's get the position for the label of the pie charts first
position2 = 
  vic_sex_race %>% 
  filter(vic_sex != "U") %>%
  mutate(csum = rev(cumsum(rev(perc*100))), 
         pos = perc*100/2 + lead(csum, 1),
         pos = if_else(is.na(pos), perc*100/2, pos))

# pie chart
vic_sex_race %>% 
  filter(vic_sex != "U") %>%
  ggplot(aes(x = "" , y = perc*100, fill = fct_inorder(vic_race))) +
  geom_col(width = 1, color = 1) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "Pastel1") +
  ggrepel::geom_label_repel(data = position2,
                   aes(y = pos, label = paste0(perc*100, "%")),
                   size = 4.5, nudge_x = 1, show.legend = FALSE) +
  guides(fill = guide_legend(title = "Victim's Race")) +
  theme_void() +
  labs(title = "Pie Chart for Different Victim Race in Different Sex") +
  facet_grid(.~vic_sex)

Most of the victims are Black (M: 72%, F: 69%), and the number of male victims in shooting cases is much more than female victims.

COVID and shooting relations

Note: The date of shooting and COVID datasets only overlap on February-December 2020, so we do the analysis base on the overlap.

shooting_mini =
  shooting %>% 
  filter(year == "2020") %>% 
  select(c("date", "incident_key", "borough"))

shooting_covid = 
  merge(x = shooting_mini, y = clean_covid, by = c("date", "borough")) %>% 
  relocate("date", "month", "day", "year", everything()) %>% 
  group_by(date) %>% 
  add_count(borough, name = "borough_n_victim") %>% # victim number equals to the count of incident_key (includind duplicate)
  distinct() %>% 
  add_count(borough, name = "borough_n_shooting") %>% # shooting number equals to the count of distinct incident_key
  select(-incident_key) %>% 
  distinct() %>% 
  add_count(date, wt = borough_n_victim, name = "total_n_victim") %>% 
  add_count(date, wt = borough_n_shooting, name = "total_n_shooting") %>% 
  mutate(
    borough = recode(borough, 
                     "bronx" = "Bronx",
                     "brooklyn" = "Brooklyn",
                     "manhattan" = "Manhattan",
                     "queens" = "Queens",
                     "staten_island" = "Staten Island")
  )

Visualize Distribution

First we want to see if there is an underlying association between total COVID cases and total shootings.

shooting_covid %>% 
  ggplot(aes(x = total_case_count, y = total_n_shooting, group = borough, color = borough)) +
  geom_point() +
  labs(
    title = "Distribution of Shooting Cases Against COVID Cases Within Each Day",
    x = "Total COVID Cases",
    y = "Total Shooting Cases"
  ) +
  guides(fill = guide_legend(title = "Borough")) +
  theme(legend.position = "right")

See separately for each borough.

shooting_covid %>% 
  ggplot(aes(x = borough_case_count, y = borough_n_shooting, group = borough, color = borough)) +
  geom_point() +
  labs(
    title = "Distribution of Shooting Cases Against COVID Cases Within Each Day",
    x = "Total COVID Cases",
    y = "Total Shooting Cases"
  ) + 
  facet_wrap(~ borough) +
  guides(fill = guide_legend(title = "Borough")) +
  theme(legend.position = "right")

We can see that for each borough, there is a general trend that higher number of shooting cases is associated with fewer COVID cases, whereas the number of shooting cases decreases when COVID case increases. This trend does not manifest in Staten Island, which may due to the limited size of data.

Then explore by month.

shooting_covid %>% 
  ggplot(aes(x = total_case_count, y = total_n_shooting, group = month, color = month)) +
  geom_point() +
  labs(
    title = "Distribution of Shooting Cases Against COVID Cases Within Each Day",
    x = "Total COVID Cases",
    y = "Total Shooting Cases"
  ) +
  facet_wrap(~ month) +
  guides(fill = guide_legend(title = "Month")) +
  theme(legend.position = "right")

We can see from the above plot that there is no general pattern of the association between total shooting and total COVID cases. But for the months in the middle of the year, especially in June, July and August, we can see that the number shooting cases is high when there are few COVID cases, which is consistent with the former analysis.

Fit a linear regression model for total COVID case with total shooting as predictor.

total_lm = lm(total_n_shooting ~ total_case_count, data = shooting_covid) 

broom::tidy(total_lm) %>% 
  knitr::kable()
term estimate std.error statistic p.value
(Intercept) 7.1159313 0.1860307 38.25137 0
total_case_count -0.0011568 0.0001142 -10.13164 0
set.seed(100)
par(mfrow = c(2,2))
plot(total_lm)

We can see from the Residuals vs Fitted plot that the residuals is not equally distributed around the 0 horizontal line. In fact, the residuals has a pattern of decrease, indicating that the model has high goodness of fit, but violates the assumption of the normal distribution of the residual. Besides, from the Normal Q-Q plot we can see that most of the data fit the line when the theoretical quantile is under 2, indicating the data might have a normal distribution in a certain range.